Global Deaths Due to Air Pollution

Elizabeth Bekele, Alison Cheek

2022-05-03

Introduction

Packages Required

Data Details

Import the deaths-due-to-air-pollution data

# Load the air-pollution death-rate data. read.csv() already returns a
# data.frame, so no extra data.frame() wrapper is needed.
deaths_df <- read.csv("death-rates-from-air-pollution.csv")

We are going to rename the columns to shorter, easier-to-type names and then glimpse the data

# Replace the original CSV headers with short snake_case names (assigned by
# position), then inspect the structure of the result.
names(deaths_df) <- c(
  "country",
  "acronym",
  "year",
  "total_deaths",
  "indoor_deaths",
  "outdoor_deaths",
  "ozone_deaths"
)

deaths_df %>% glimpse()
## Rows: 6,468
## Columns: 7
## $ country        <chr> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanist~
## $ acronym        <chr> "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG", "AFG",~
## $ year           <int> 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1~
## $ total_deaths   <dbl> 299.4773, 291.2780, 278.9631, 278.7908, 287.1629, 288.0~
## $ indoor_deaths  <dbl> 250.3629, 242.5751, 232.0439, 231.6481, 238.8372, 239.9~
## $ outdoor_deaths <dbl> 46.44659, 46.03384, 44.24377, 44.44015, 45.59433, 45.36~
## $ ozone_deaths   <dbl> 5.616442, 5.603960, 5.611822, 5.655266, 5.718922, 5.739~

Variables that interest us here include: country, total_deaths, indoor_deaths, outdoor_deaths, ozone_deaths

Now, we are going to import the world-population data and glimpse it

# Load the world population data and inspect its structure.
world_pop <- read.csv(file = "population_total_long.csv")
world_pop %>% glimpse()
## Rows: 12,595
## Columns: 3
## $ Country.Name <chr> "Aruba", "Afghanistan", "Angola", "Albania", "Andorra", "~
## $ Year         <int> 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 1960, 196~
## $ Count        <int> 54211, 8996973, 5454933, 1608800, 13411, 92418, 20481779,~

To get a general idea of the ‘deaths’ data frame we made, let’s make a plot to see what’s happening. This is a plot of indoor vs. outdoor deaths around the world, with one point per country and year.

# Scatter plot of indoor vs. outdoor death rates, one point per row of
# deaths_df. The `text` aesthetic ("country, year") is not drawn by
# geom_point(); it is carried along for the interactive ggplotly() version.
d <- ggplot(
  data = deaths_df,
  mapping = aes(
    x = indoor_deaths,
    y = outdoor_deaths,
    text = paste0(country, ", ", year)
  )
) +
  geom_point()
ggplotly(d)

This is a mess, and so we chose two countries from each continent (a high-population and a low-population country) to graph.

Exploratory Data Analysis

First, we split the data into high and low population based on country

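- Low population = high population × 0.10 (each low-population country has roughly one tenth of the population of the high-population country chosen for the same continent)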

# High-population countries: keep their world_pop rows for years after 1996,
# grouped by year.
high_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Year)

high_pop_countries
## # A tibble: 126 x 3
## # Groups:   Year [21]
##    Country.Name   Year     Count
##    <chr>         <int>     <int>
##  1 Australia      1997  18517000
##  2 Brazil         1997 167209040
##  3 Germany        1997  82034771
##  4 Nigeria        1997 113457663
##  5 Pakistan       1997 131057431
##  6 United States  1997 272657000
##  7 Australia      1998  18711000
##  8 Brazil         1998 169785250
##  9 Germany        1998  82047195
## 10 Nigeria        1998 116319759
## # ... with 116 more rows
# Low-population countries: keep their world_pop rows for years after 1996,
# grouped by year.
low_pop_countries <- world_pop %>%
  filter(
    Year > 1996,
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Year)

low_pop_countries
## # A tibble: 126 x 3
## # Groups:   Year [21]
##    Country.Name  Year    Count
##    <chr>        <int>    <int>
##  1 Canada        1997 29905948
##  2 Chile         1997 14786220
##  3 Sri Lanka     1997 18470900
##  4 Malawi        1997 10264906
##  5 New Zealand   1997  3781300
##  6 Serbia        1997  7596501
##  7 Canada        1998 30155173
##  8 Chile         1998 14977733
##  9 Sri Lanka     1998 18564599
## 10 Malawi        1998 10552338
## # ... with 116 more rows
# Mean total deaths from 1990-2017 of high-population countries.
# summarize() keeps the grouping column (country) in its result, so no
# select() is needed before it; selecting only total_deaths on grouped data
# makes dplyr re-add `country` and emit a message.
deaths_highpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_high = mean(total_deaths))
#deaths_highpop_countries


# Mean total deaths from 1990-2017 of low-population countries.
# summarize() keeps the grouping column (country) in its result, so no
# select() is needed before it; selecting only total_deaths on grouped data
# makes dplyr re-add `country` and emit a message.
deaths_lowpop_countries <- deaths_df %>%
  filter(
    country %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(country) %>%
  summarize(average_death_low = mean(total_deaths))
#deaths_lowpop_countries
# Render the high- and low-population death-rate summaries together by
# passing both tables to kable() as a list.
list(
  deaths_highpop_countries,
  deaths_lowpop_countries
) %>%
  kable()
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383
# High-population death-rate table, not full width, floated to the left.
deaths_highpop_countries %>%
  kable() %>%
  kable_styling(position = "float_left", full_width = FALSE)
country average_death_high
Australia 17.76815
Brazil 48.42928
Germany 28.10988
Nigeria 112.30157
Pakistan 144.33463
United States 26.35827
# Low-population death-rate table, not full width, aligned to the left.
deaths_lowpop_countries %>%
  kable() %>%
  kable_styling(position = "left", full_width = FALSE)
country average_death_low
Canada 18.18542
Chile 36.51321
Malawi 147.77167
New Zealand 15.92536
Serbia 80.66558
Sri Lanka 69.60383
# Horizontal bar chart of the mean death rate for each high-population country.
ggplot(
  data = deaths_highpop_countries,
  mapping = aes(x = country, y = average_death_high)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart of the mean death rate for each low-population country.
ggplot(
  data = deaths_lowpop_countries,
  mapping = aes(x = country, y = average_death_low)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average deaths (per 100,000)",
    title = "Average total deaths in low-population countries"
  ) +
  coord_flip()

This shows us the deaths due to pollution, but what about the average population of those countries at that time?

# Mean population of each high-population country over every year present in
# world_pop. summarize() keeps the grouping column (Country.Name), so no
# select() is needed before it; selecting only Count on grouped data makes
# dplyr re-add `Country.Name` and emit a message.
# NOTE(review): no Year filter is applied here, so the averaging window may
# differ from the one used for the death-rate averages - confirm intended.
hp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "United States", "Brazil", "Nigeria",
      "Germany", "Pakistan", "Australia"
    )
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#hp_countries_population

# Mean population of each low-population country over every year present in
# world_pop. summarize() keeps the grouping column (Country.Name), so no
# select() is needed before it; selecting only Count on grouped data makes
# dplyr re-add `Country.Name` and emit a message.
# NOTE(review): no Year filter is applied here, so the averaging window may
# differ from the one used for the death-rate averages - confirm intended.
lp_countries_population <- world_pop %>%
  filter(
    Country.Name %in% c(
      "Canada", "Chile", "Malawi",
      "Serbia", "Sri Lanka", "New Zealand"
    )
  ) %>%
  group_by(Country.Name) %>%
  summarize(average_population = mean(Count))
#lp_countries_population

# Render the high- and low-population average-population summaries together
# by passing both tables to kable() as a list.
list(
  hp_countries_population,
  lp_countries_population
) %>%
  kable()
Country.Name average_population
Australia 16825202
Brazil 142954579
Germany 79431999
Nigeria 99902237
Pakistan 110976559
United States 251243424
Country.Name average_population
Canada 27180819
Chile 13125858
Malawi 8997793
New Zealand 3477179
Serbia 7420310
Sri Lanka 16356619
# High-population average-population table, not full width, floated left.
hp_countries_population %>%
  kable() %>%
  kable_styling(position = "float_left", full_width = FALSE)
Country.Name average_population
Australia 16825202
Brazil 142954579
Germany 79431999
Nigeria 99902237
Pakistan 110976559
United States 251243424
# Low-population average-population table, not full width, aligned left.
lp_countries_population %>%
  kable() %>%
  kable_styling(position = "left", full_width = FALSE)
Country.Name average_population
Canada 27180819
Chile 13125858
Malawi 8997793
New Zealand 3477179
Serbia 7420310
Sri Lanka 16356619
# Horizontal bar chart of the mean population of each high-population country.
ggplot(
  data = hp_countries_population,
  mapping = aes(x = Country.Name, y = average_population)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average high-population countries"
  ) +
  coord_flip()

# Horizontal bar chart of the mean population of each low-population country.
ggplot(
  data = lp_countries_population,
  mapping = aes(x = Country.Name, y = average_population)
) +
  geom_col() +
  labs(
    x = "Country",
    y = "Average Population",
    title = "Average low-population countries"
  ) +
  coord_flip()

#Join the data sets so we can overlay the two graphs or do a stacked bar chart?

Summary